In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re, json, time
from matplotlib_venn import venn2, venn3
from scipy import stats
from utility_functions import *
DATE
Out[1]:
'20250630'

Sankey plot¶

In [2]:
from pySankey.sankey import sankey as sankeyplot
import plotly.graph_objects as go
In [3]:
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"
filtering = 'global'
# filtering = 'hybrid'
data = []
for dataset_name in ["PXD014258.v0.11.4","PXD002057.v0.11.4","PXD005833.v0.11.4"]:
    data.append(pd.read_csv(os.path.join(working_folder, dataset_name,
                                         f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}-outerjoin.csv.gz')))
for _ in data:
    print(_.shape)
data = pd.concat(data, ignore_index=True)
print(data.shape)

F, counts = make_sankey_plot_with_counts(data, suffixes=['_closedsearch','_opensearch'])
F.write_image(f"publication-data/{DATE}-Sankey-closedsearch-opensearch-{filtering}-filtering.svg")
F.show()
(128800, 61)
(46757, 61)
(165822, 61)
(341379, 61)
In [4]:
# searches overall overlap
closed_search_identified_spectra = data[~data.database_closedsearch.isna()].spectrum_title
open_search_identified_spectra   = data[~data.database_opensearch.isna()].spectrum_title
open_search_identified_spectra
venn2([set(closed_search_identified_spectra),set(open_search_identified_spectra)], 
      set_labels=['Closed search','Open search'],
      set_colors=sns.color_palette("Set2")[:2])
plt.title('Identified spectra overlap (all datasets)')
plt.savefig(f"publication-data/{DATE}-overall-overlap-closedsearch-opensearch-{filtering}-filtering.svg")
No description has been provided for this image
In [5]:
len(set(open_search_identified_spectra))/len(set(closed_search_identified_spectra))
Out[5]:
1.597610958579414

In [6]:
# F = make_sankey_plot_with_counts(data, sankey_labels)
# F.write_image("publication-data/{DATE}-Sankey-plot-closedsearch-opensearch.svg")
# F.show()
In [7]:
data3 = counts.loc[['Canonical+Unmodified/Expected','NonCanonical+Unmodified/Expected',
                   'Decoy','Unidentified'],
                ['Canonical+Unmodified/Expected','Canonical+Unexpected',
                 'NonCanonical+Unmodified/Expected','NonCanonical+Unexpected',
                 'Decoy','Unidentified']]
data3.style.background_gradient()
Out[7]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_closedsearch            
Canonical+Unmodified/Expected 161808 17531 520 1367 712 16104
NonCanonical+Unmodified/Expected 236 605 674 185 33 594
Decoy 254 482 24 48 179 1152
Unidentified 67965 62394 2682 3294 2536 0

In [8]:
# All spectra
tmp = data3.iloc[:,:]
print(tmp.sum().sum())
tmp
341379
Out[8]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 520 1367 712 16104
NonCanonical+Unmodified/Expected 236 605 674 185 33 594
Decoy 254 482 24 48 179 1152
Unidentified 67965 62394 2682 3294 2536 0
In [9]:
# Any --> Any
tmp = data3.iloc[:-1,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
184658
54.1%
Out[9]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 520 1367 712
NonCanonical+Unmodified/Expected 236 605 674 185 33
Decoy 254 482 24 48 179
In [10]:
# Any --> Any
tmp = data3.iloc[:,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
323529
94.8%
Out[10]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 520 1367 712
NonCanonical+Unmodified/Expected 236 605 674 185 33
Decoy 254 482 24 48 179
Unidentified 67965 62394 2682 3294 2536
In [11]:
# Identified by closed search
tmp = data3.iloc[:-1,:]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
202508
59.3%
Out[11]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 520 1367 712 16104
NonCanonical+Unmodified/Expected 236 605 674 185 33 594
Decoy 254 482 24 48 179 1152
In [12]:
# Expected --> Expected ptm
tmp = data3.iloc[:2,[0,2]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
163238
47.8%
Out[12]:
sankey_opensearch Canonical+Unmodified/Expected NonCanonical+Unmodified/Expected
sankey_closedsearch
Canonical+Unmodified/Expected 161808 520
NonCanonical+Unmodified/Expected 236 674
In [13]:
# Expected --> Unexpected ptm
tmp = data3.iloc[:2,[1,3]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
19688
5.8%
Out[13]:
sankey_opensearch Canonical+Unexpected NonCanonical+Unexpected
sankey_closedsearch
Canonical+Unmodified/Expected 17531 1367
NonCanonical+Unmodified/Expected 605 185
In [14]:
# Unidentified --> Any ID
tmp = data3.iloc[-1,:-2]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
136335
Out[14]:
sankey_opensearch
Canonical+Unmodified/Expected       0.50
Canonical+Unexpected                0.46
NonCanonical+Unmodified/Expected    0.02
NonCanonical+Unexpected             0.02
Name: Unidentified, dtype: float64
In [15]:
# NonCanon --> Any
tmp = data3.iloc[1,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
2327
Out[15]:
sankey_opensearch
Canonical+Unmodified/Expected       0.10
Canonical+Unexpected                0.26
NonCanonical+Unmodified/Expected    0.29
NonCanonical+Unexpected             0.08
Decoy                               0.01
Unidentified                        0.26
Name: NonCanonical+Unmodified/Expected, dtype: float64
In [16]:
# Canon --> Any
tmp = data3.iloc[0,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),3)
198042
Out[16]:
sankey_opensearch
Canonical+Unmodified/Expected       0.817
Canonical+Unexpected                0.089
NonCanonical+Unmodified/Expected    0.003
NonCanonical+Unexpected             0.007
Decoy                               0.004
Unidentified                        0.081
Name: Canonical+Unmodified/Expected, dtype: float64

Compare by-correlation of spectra identified in both searches (closed & open)¶

In [17]:
combo = []
for dataset_name in ["PXD014258.v0.11.4","PXD002057.v0.11.4","PXD005833.v0.11.4"]:
    tmp = pd.read_csv(os.path.join(working_folder, dataset_name,
                                   f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}.csv.gz'))
    print(tmp.shape)
    combo.append(tmp)
    del tmp
combo = pd.concat(combo, ignore_index=True)
print(combo.shape)
combo.tail()
(83173, 61)
(19200, 61)
(82285, 61)
(184658, 61)
Out[17]:
spectrum_title scan spectrum_file precursor_mass_closedsearch database_peptide_closedsearch matched_peptide_closedsearch modifications_closedsearch database_closedsearch psm_score_closedsearch global_q_closedsearch ... by-explained_opensearch b-explained_opensearch y-explained_opensearch all-explained_opensearch by-intensity-pattern-correlation_opensearch top_tag_rank_nterm_opensearch top_tag_rank_cterm_opensearch top_tag_rank_opensearch predicted_retention_time_opensearch retention_time_error_adjusted_opensearch
184653 AM9:controllerType=0 controllerNumber=1 scan=1... 14152 AM9 1936.986225 ETFLRELISNSSDALDK ETFLRELISNSSDALDK Unmodified T 0.237943 0.003281 ... 0.1387 673 714 0.1582 0.3885 101 0 0 2154.488398 337.301222
184654 AM9:controllerType=0 controllerNumber=1 scan=5242 5242 AM9 984.536291 VASSQQLPR VASSQQLPR Unmodified T 0.196643 0.003818 ... 0.1911 232 1679 0.2505 0.9060 101 71 71 541.705442 785.104618
184655 AM9:controllerType=0 controllerNumber=1 scan=2... 22169 AM9 2372.145727 KDLYTNTVLSGGTTMYPGIADR KDLYTNTVLSGGTTMYPGIADR Unmodified T 0.181762 0.004363 ... 0.3967 897 3070 0.4156 0.7955 54 2 2 3826.140377 51.898757
184656 AM9:controllerType=0 controllerNumber=1 scan=1... 18614 AM9 2085.085425 QPGPHIIYRCPSALQHIR QPGPHIIYRCPSALQHIR Unmodified T 0.028571 0.008921 ... 0.1203 444 759 0.1299 0.8856 101 0 0 3141.328678 94.136818
184657 AM9:controllerType=0 controllerNumber=1 scan=1... 17788 AM9 1684.761083 SGFATNAASAGGYCRPR SGFATNAASAGGYCRPR Unmodified D 0.001249 0.009685 ... 0.2333 150 2183 0.2448 0.9301 25 9 9 3366.782703 421.956723

5 rows × 61 columns

In [18]:
combo["Same_peptide"]=combo.matched_peptide_closedsearch==combo.matched_peptide_opensearch
combo["Same_mod_peptide"]=combo.modified_peptide_closedsearch==combo.modified_peptide_opensearch
combo["Same_mods_noRT"]=(combo.matched_peptide_closedsearch+combo.modifications_noRT_closedsearch)==(combo.matched_peptide_opensearch+combo.modifications_noRT_opensearch)

def samepep_and_samemod(row):
    if row.Same_peptide and row.Same_mods_noRT:
        return 'Same_peptidoform'
    elif row.Same_peptide and not row.Same_mods_noRT:
        return 'Positional_isoform'
    elif not row.Same_peptide and not row.Same_mods_noRT:
        return 'Different_peptide'
combo['samepep_samemod'] = combo.apply(samepep_and_samemod, axis=1)
# print(combo['samepep_samemod'].value_counts())

VAR, VAL = 'samepep_samemod','by-intensity-pattern-correlation'
for i,df in combo.groupby('samepep_samemod').__iter__():
    _,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
    print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

ORDER = ['Same_peptidoform','Positional_isoform','Different_peptide']
tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file','samepep_samemod'], 
                           value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'], 
                           var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x='samepep_samemod', y='value', hue='search', aspect=1.25, showfliers=False, notch=True, 
            palette='Set2', order=ORDER)
plt.title('Intensity correlation across searches')
plt.ylabel('by-intensity-pattern-correlation')
plt.xlabel(None)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-1-{filtering}.svg")
Different_peptide: p=2.3e-124 True (n=15581) effect size = 0.228
Positional_isoform: p=4.7e-02 True (n=14118) effect size = 0.032
Same_peptidoform: p=9.4e-01 False (n=154959) effect size = 0.000
No description has been provided for this image
In [19]:
combo['target_decoy'] = combo.apply(lambda row: f'{row.database_closedsearch}->{row.database_opensearch}', axis=1)
combo.target_decoy.value_counts()

VAR, VAL = 'target_decoy','by-intensity-pattern-correlation'
for i,df in combo.groupby(VAR).__iter__():
    _,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
    print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file','target_decoy'], 
                           value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'], 
                           var_name='search', value_name='value')

sns.catplot(data=tmp, kind='box',   x='target_decoy', y='value', hue='search', 
            aspect=1.25, showfliers=False, notch=True, palette='Set2', order=['T->T','D->D','D->T','T->D'])
plt.title('Intensity correlation across searches')
plt.xlabel('Target or Decoy hit (closed->open search)')
plt.ylabel('by-intensity-pattern-correlation')
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-2-{filtering}.svg")
D->D: p=2.6e-02 True (n=179) effect size = 0.200
D->T: p=5.5e-43 True (n=808) effect size = 0.661
T->D: p=2.7e-01 False (n=745) effect size = 0.070
T->T: p=1.6e-09 True (n=182926) effect size = 0.019
No description has been provided for this image
In [20]:
combo['canon_before_after'] = combo.apply(lambda row: f'{row.isCanonical_closedsearch}->{row.isCanonical_opensearch}', axis=1)
# print(combo.canon_before_after.value_counts(sort=False))
ORDER = [
    'Canonical->Canonical',
    'NonCanonical->NonCanonical',
    'Canonical->NonCanonical',
    'NonCanonical->Canonical',
    
]

VAR, VAL =  'canon_before_after', 'by-intensity-pattern-correlation',

for i,df in combo.groupby(VAR).__iter__():
    _,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
    print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file',VAR], 
                       value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'], 
                       var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x=VAR, y='value', hue='search', showfliers=False,
            notch=True, palette='Set2', order=ORDER
            # aspect=2
           )
plt.xlabel('Canonical protein hit (closed->open search)')
plt.xticks(rotation=90)
plt.ylabel(VAL)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-3-{filtering}.svg")
Canonical->Canonical: p=2.6e-07 True (n=180326) effect size = 0.015
Canonical->NonCanonical: p=2.0e-03 True (n=2257) effect size = 0.066
NonCanonical->Canonical: p=3.7e-86 True (n=1137) effect size = 0.855
NonCanonical->NonCanonical: p=1.1e-01 False (n=938) effect size = 0.103
No description has been provided for this image

save¶

In [ ]:
# Auto save & export notebook to html!!
autosave(extra_labels='-'+filtering)
filtering